import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the yearly NYC taxi trip samples. Each file is line-delimited JSON
# (one record per line), hence ``lines=True``.
trip_sample_paths = [
    'data-sample_data-nyctaxi-trips-2009-json_corrigido.json',
    'data-sample_data-nyctaxi-trips-2010-json_corrigido.json',
    'data-sample_data-nyctaxi-trips-2011-json_corrigido.json',
    'data-sample_data-nyctaxi-trips-2012-json_corrigido.json',
]
db_2009, db_2010, db_2011, db_2012 = [
    pd.read_json(sample_path, lines=True) for sample_path in trip_sample_paths
]

# Print the distinct vendor identifiers found in each yearly sample.
for yearly_sample in (db_2009, db_2010, db_2011, db_2012):
    print(yearly_sample['vendor_id'].unique())
# Load the payment and vendor lookup CSVs.
# header=1: column names are taken from the second line of the payment file,
# so its first line is skipped.
# NOTE(review): presumably that first line is not real data - confirm against the file.
db_pay = pd.read_csv('data-payment_lookup-csv.csv', header=1)
# header=0: column names are taken from the first line of the vendor file.
db_vendor = pd.read_csv('data-vendor_lookup-csv.csv', header=0)
# Notebook-style inspection: these bare expressions only render output when
# they are the last statement of a notebook cell; in a plain script they do
# nothing.
db_pay.head()
db_vendor.head()
# Column listings of each yearly trip frame (same notebook-only display).
db_2009.columns
db_2010.columns
db_2011.columns
db_2012.columns
# Which are the 3 largest vendors by total amount of money collected?
def _top_vendors_by_revenue(trips, n=3):
    """Return the ``n`` vendors with the largest summed ``total_amount``.

    Args:
        trips: DataFrame with ``vendor_id`` and ``total_amount`` columns.
        n: Number of vendors to keep.

    Returns:
        Series indexed by ``vendor_id``, sorted from highest to lowest total.
    """
    # Select the column *before* aggregating so that only ``total_amount`` is
    # summed, rather than every column of the frame.
    revenue_per_vendor = trips.groupby('vendor_id')['total_amount'].sum()
    return revenue_per_vendor.sort_values(ascending=False).head(n)


# Per-year ranking; a blank separator line is printed between years.
labelled_years = [
    ('2009: ', db_2009),
    ('2010: ', db_2010),
    ('2011: ', db_2011),
    ('2012: ', db_2012),
]
for position, (year_label, year_trips) in enumerate(labelled_years):
    if position:
        print(' ')
    print(year_label, _top_vendors_by_revenue(year_trips))

# Ranking over the four years combined. ``db_complete`` is a copy made at
# this point, so columns added to the yearly frames later do not appear in it.
db_complete = pd.concat([db_2009, db_2010, db_2011, db_2012])
print('2009 - 2012: ', _top_vendors_by_revenue(db_complete))
# What is the average distance travelled by trips with at most 2 passengers?
print('distancia média para 2 ou menos passageiros:')
# Boolean mask of the trips carrying two passengers or fewer.
at_most_two_passengers = db_complete['passenger_count'] <= 2
mean_distance = db_complete.loc[at_most_two_passengers, 'trip_distance'].mean()
print(mean_distance)
# Build a histogram of the monthly distribution, over the 4 years, of trips
# paid in cash. This section prepares the data: the drop-off month of every
# cash trip, per year.
# Notebook-only inspection of the combined frame and of the payment labels.
db_complete.head()
db_complete['payment_type'].unique()

# Spellings of the cash payment type that are counted as cash trips.
CASH_LABELS = ['Cash', 'CASH']


def _cash_dropoff_months(trips):
    """Return the drop-off month (1-12) of every cash-paid trip in ``trips``.

    Side effects on ``trips`` (done in place so later steps can reuse them):
      * ``dropoff_datetime`` is converted to datetime values;
      * a ``dropoff_months`` column with the drop-off month number is added.

    Args:
        trips: DataFrame with ``dropoff_datetime`` and ``payment_type`` columns.

    Returns:
        Series of month numbers for the rows whose ``payment_type`` is one of
        ``CASH_LABELS``, in the frame's original row order.
    """
    trips['dropoff_datetime'] = pd.to_datetime(trips['dropoff_datetime'])
    # Vectorised month extraction instead of a Python-level call per row.
    trips['dropoff_months'] = trips['dropoff_datetime'].dt.month
    paid_in_cash = trips['payment_type'].isin(CASH_LABELS)
    return trips.loc[paid_in_cash, 'dropoff_months']


filter_trip_2009 = _cash_dropoff_months(db_2009)
filter_trip_2010 = _cash_dropoff_months(db_2010)
filter_trip_2011 = _cash_dropoff_months(db_2011)
filter_trip_2012 = _cash_dropoff_months(db_2012)

# Month-number series for every trip of each year (cash or not), kept under
# their existing names because later statements refer to them.
nov_mask_2009 = db_2009['dropoff_months']
nov_mask_2010 = db_2010['dropoff_months']
nov_mask_2011 = db_2011['dropoff_months']
nov_mask_2012 = db_2012['dropoff_months']
## Histogram of the monthly distribution, over the 4 years, of trips paid in cash.
# One unit-wide bin per calendar month, centred on the month numbers 1..12.
# With matplotlib's default of 10 bins the 12 months would be spread over
# bins 1.1 wide, so bars would not correspond to months.
month_bins = np.arange(0.5, 13.5, 1.0)

# (legend label, cash-trip months, bar colour) for each year.
cash_months_by_year = [
    (2009, filter_trip_2009, 'g'),
    (2010, filter_trip_2010, 'r'),
    (2011, filter_trip_2011, 'b'),
    (2012, filter_trip_2012, 'y'),
]

plt.figure(figsize=(20, 10))
# The years are overlaid on the same axes; alpha keeps overlapping bars visible.
for year, cash_months, bar_colour in cash_months_by_year:
    plt.hist(cash_months, bins=month_bins, color=bar_colour, alpha=0.5, label=year)
plt.xticks(range(1, 13))
plt.xlabel('Months')
plt.ylabel('Number of trips')
plt.title('Monthly distribution of cash payed trips')
plt.legend()
plt.show()
# Make sure the drop-off timestamps are datetimes (a no-op if already converted).
db_2012['dropoff_datetime'] = pd.to_datetime(db_2012['dropoff_datetime'])
# Calendar day of each drop-off: the timestamp with its time part set to
# midnight, so that all trips of one day share the same value.
db_2012['dropoff_days'] = db_2012['dropoff_datetime'].dt.normalize()

# Select the last three months present in the 2012 data. The first month is
# derived from the data instead of being hard-coded, so the selection is
# always exactly three months wide.
dropoff_month_2012 = db_2012['dropoff_datetime'].dt.month
first_selected_month = dropoff_month_2012.max() - 2
last_three_months = db_2012[dropoff_month_2012 >= first_selected_month]
# Notebook-only inspection of the selection.
last_three_months['dropoff_months'].unique()
last_three_months

# Tip amount of every selected trip that has a valid drop-off day.
# ``notna()`` is a real missing-value test; comparing against the string
# 'NaN' never excludes anything.
tip_last_90 = last_three_months['tip_amount'][last_three_months['dropoff_days'].notna()]
tip_last_90

# Time-series plot counting the number of tips on each day of the last
# 3 months of 2012. A trip counts as tipped when its tip amount is positive;
# days that have trips but no tips get a count of 0.
tips_per_day = (
    (last_three_months['tip_amount'] > 0)
    .groupby(last_three_months['dropoff_days'])
    .sum()
)
plt.figure(figsize=(20,10))
plt.plot(tips_per_day.index, tips_per_day.values)
plt.ylabel('Number of tips')
plt.xlabel('days')
plt.title('Amount of tips last 3 months')
plt.show()
!pip install plotly==4.5.0
!pip install the chart-studio
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
# Mapbox access token. ``accesstoken`` expects the bare token (the value of
# the ``access_token`` query parameter), not the whole style-sharing URL.
token = 'pk.eyJ1IjoiZmlsaXBlc2NtIiwiYSI6ImNrNmdxbzRkNDBtdTUza210ZDdwMWFlczUifQ.u6sDxXHhFYrgvbEM0cHUJw'

# Scatter map of the 2010 pickup coordinates.
fig = go.Figure(go.Scattermapbox(
    lat=db_2010['pickup_latitude'],
    lon=db_2010['pickup_longitude'],
    mode='markers',
    marker=go.scattermapbox.Marker(
        size=14
    ),
    # A single string is applied to every marker; a one-element list would
    # only label the first point.
    text='Pickup Points',
))
fig.update_layout(
    hovermode='closest',
    mapbox=go.layout.Mapbox(
        accesstoken=token,
        bearing=0,
        # Initial map centre (latitude / longitude in degrees).
        center=go.layout.mapbox.Center(
            lat=40.783,
            lon=-73.966
        ),
        pitch=0,
        zoom=5
    )
)
fig.show()